/*
 * Copyright (c) 2019-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <algorithm>

#include "arm_gemm.hpp"


#include "../../asmlib.hpp"
#include "../../utils.hpp"

namespace arm_gemm {

void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
    const long loops_count = iceildiv(N, (int)4) - 1;
    const long ldab = lda * sizeof(float);
    const long ldcb = ldc * sizeof(float);
    float nullbias[4];
    if (!bias) {
        memset(nullbias, 0, (4 * sizeof(float)));
    }
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
    const float * const minptr = &minval;
    const float * const maxptr = &maxval;

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    for (int y0=0; y0<M; y0+=8) {
        long loops = loops_count;
        long oob_rows = std::max(8 - (M-y0), 0);
        const float *b_ptr0 = B;
        const float *biasptr = bias ? bias : nullbias;
        const uint64_t biasinc = bias ? 4*sizeof(float) : 0;
        const float *a_ptr0 = A + (y0 * lda);

        float *c_ptr0 = C + (y0 * ldc);

        switch(K) {
            case 1:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr s0, [%[a_ptr0]]\n"
                    "ldr s1, [a_ptr1]\n"
                    "ldr s2, [a_ptr2]\n"
                    "ldr s3, [a_ptr3]\n"
                    "ldr s4, [a_ptr4]\n"
                    "ldr s5, [a_ptr5]\n"
                    "ldr s6, [a_ptr6]\n"
                    "ldr s7, [a_ptr7]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v27.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v29.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x10\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v27.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v29.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 2:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr d0, [%[a_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr d1, [a_ptr1]\n"
                    "ldr d2, [a_ptr2]\n"
                    "ldr d3, [a_ptr3]\n"
                    "ldr d4, [a_ptr4]\n"
                    "ldr d5, [a_ptr5]\n"
                    "ldr d6, [a_ptr6]\n"
                    "ldr d7, [a_ptr7]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x20\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v27.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v29.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            // Variant that reads three floats from each of eight A rows and three
            // 4-float B vectors per output block. For every block it seeds eight
            // accumulators (v24-v31) with a 4-float bias vector, applies three
            // fmla-by-element steps, clamps to the values behind minptr/maxptr and
            // stores one 4-float vector per C row.
            // NOTE(review): the switch expression is outside this chunk; given the
            // three-element loads this is presumably the K == 3 case - confirm.
            case 3:
                __asm __volatile (
                    // Give symbolic names to the scratch registers that hold the A and C
                    // pointers for rows 1-7 (row 0 uses the a_ptr0/c_ptr0 operands).
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    // Each row pointer is the previous row's pointer plus the stride
                    // operand (lda/ldc are bound to the byte strides ldab/ldcb).
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    // If oob_rows is non-zero, walk from row 7 downwards and point that
                    // many rows' A and C pointers at row 0 instead, so those rows read
                    // row 0's input and write to row 0's output location. The counter is
                    // decremented per row and the chain exits as soon as it hits zero.
                    // NOTE(review): presumably these are rows past the end of the
                    // matrix - confirm against the code that sets oob_rows.
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    // Load the first three B vectors into v16-v18 and the first two
                    // floats of every A row into lanes 0-1 of v0-v7; each A pointer is
                    // post-incremented by 8 bytes.
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr d0, [%[a_ptr0]], #0x8\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr d1, [a_ptr1], #0x8\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr d2, [a_ptr2], #0x8\n"
                    "ldr d3, [a_ptr3], #0x8\n"
                    "ldr d4, [a_ptr4], #0x8\n"
                    "ldr d5, [a_ptr5], #0x8\n"
                    "ldr d6, [a_ptr6], #0x8\n"
                    "ldr d7, [a_ptr7], #0x8\n"
                    // Insert the third float of every A row into lane 2. Lane 3 is not
                    // read by any fmla below. v0-v7 are not reloaded after this point,
                    // so the same A values are reused for every output block.
                    "ld1 {v0.s}[2], [%[a_ptr0]]\n"
                    "ld1 {v1.s}[2], [a_ptr1]\n"
                    "ld1 {v2.s}[2], [a_ptr2]\n"
                    "ld1 {v3.s}[2], [a_ptr3]\n"
                    "ld1 {v4.s}[2], [a_ptr4]\n"
                    "ld1 {v5.s}[2], [a_ptr5]\n"
                    "ld1 {v6.s}[2], [a_ptr6]\n"
                    "ld1 {v7.s}[2], [a_ptr7]\n"
                    // Load-prefetch hints at increasing offsets from a_ptr7.
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    // Step B past the three vectors just loaded (0x30 bytes).
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    // loops == 0: only one block to produce, handled at label 2.
                    "cbz %[loops], 2f\n"
                    // First block: load the bias vector, advance biasptr by biasinc and
                    // decrement the loop counter. The flags set by this subs are consumed
                    // by the b.eq before label 4 (nothing in between writes the flags).
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    // Copy the bias into the accumulators for rows 1-7.
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    // acc[row] += B vector j * A[row] lane j, for j = 0, 1, 2.
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    // Counter reached zero after the first decrement: skip the loop.
                    "b.eq 3f\n"
                    // Main loop. Each iteration finishes the previous block (clamp with
                    // fmax against the broadcast *minptr in v22 and fmin against the
                    // broadcast *maxptr in v23, then store and advance every C pointer by
                    // 16 bytes) while loading the next three B vectors and the next bias
                    // vector. The subs here sets the flags tested by the closing b.ne.
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    // Accumulate the next block, interleaved with store-prefetch hints
                    // 0x40 bytes ahead of each C row pointer.
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "b.ne 4b\n"
                    // Loop exit: clamp and store the previous block and compute one more
                    // block, as in a loop iteration but without the counter update and
                    // without the store prefetches; the result is stored at label 5.
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x30\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "b 5f\n"
                    // Single-block path (entered when loops was zero on entry): seed the
                    // accumulators with the bias and accumulate using the B vectors that
                    // were loaded after label 1.
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    // Common tail: clamp the last block and store it. Only c_ptr0 is
                    // advanced here; the row 1-7 pointers are scratch and are left as-is.
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    // Drop the register aliases so later asm statements can define the
                    // same names again.
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    // Read-write operands: every one of these is modified by the asm.
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    // Read-only operands: byte strides, bias step and the clamp bounds.
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    // Clobbers: x0-x13 (the aliased row pointers), v0-v31, the condition
                    // flags and memory.
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 4:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "ldr q1, [a_ptr1]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "ldr q2, [a_ptr2]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "ldr q3, [a_ptr3]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "ldr q4, [a_ptr4]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "ldr q5, [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "ldr q6, [a_ptr6]\n"
                    "ldr q7, [a_ptr7]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v1.s[3]\n"
                    "fmla v26.4s, v19.4s, v2.s[3]\n"
                    "fmla v27.4s, v19.4s, v3.s[3]\n"
                    "fmla v28.4s, v19.4s, v4.s[3]\n"
                    "fmla v29.4s, v19.4s, v5.s[3]\n"
                    "fmla v30.4s, v19.4s, v6.s[3]\n"
                    "fmla v31.4s, v19.4s, v7.s[3]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v27.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v29.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v1.s[3]\n"
                    "fmla v26.4s, v19.4s, v2.s[3]\n"
                    "fmla v27.4s, v19.4s, v3.s[3]\n"
                    "fmla v28.4s, v19.4s, v4.s[3]\n"
                    "fmla v29.4s, v19.4s, v5.s[3]\n"
                    "fmla v30.4s, v19.4s, v6.s[3]\n"
                    "fmla v31.4s, v19.4s, v7.s[3]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x40\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v1.s[3]\n"
                    "fmla v26.4s, v19.4s, v2.s[3]\n"
                    "fmla v27.4s, v19.4s, v3.s[3]\n"
                    "fmla v28.4s, v19.4s, v4.s[3]\n"
                    "fmla v29.4s, v19.4s, v5.s[3]\n"
                    "fmla v30.4s, v19.4s, v6.s[3]\n"
                    "fmla v31.4s, v19.4s, v7.s[3]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v1.s[0]\n"
                    "fmla v26.4s, v16.4s, v2.s[0]\n"
                    "fmla v27.4s, v16.4s, v3.s[0]\n"
                    "fmla v28.4s, v16.4s, v4.s[0]\n"
                    "fmla v29.4s, v16.4s, v5.s[0]\n"
                    "fmla v30.4s, v16.4s, v6.s[0]\n"
                    "fmla v31.4s, v16.4s, v7.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v1.s[1]\n"
                    "fmla v26.4s, v17.4s, v2.s[1]\n"
                    "fmla v27.4s, v17.4s, v3.s[1]\n"
                    "fmla v28.4s, v17.4s, v4.s[1]\n"
                    "fmla v29.4s, v17.4s, v5.s[1]\n"
                    "fmla v30.4s, v17.4s, v6.s[1]\n"
                    "fmla v31.4s, v17.4s, v7.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v1.s[2]\n"
                    "fmla v26.4s, v18.4s, v2.s[2]\n"
                    "fmla v27.4s, v18.4s, v3.s[2]\n"
                    "fmla v28.4s, v18.4s, v4.s[2]\n"
                    "fmla v29.4s, v18.4s, v5.s[2]\n"
                    "fmla v30.4s, v18.4s, v6.s[2]\n"
                    "fmla v31.4s, v18.4s, v7.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v1.s[3]\n"
                    "fmla v26.4s, v19.4s, v2.s[3]\n"
                    "fmla v27.4s, v19.4s, v3.s[3]\n"
                    "fmla v28.4s, v19.4s, v4.s[3]\n"
                    "fmla v29.4s, v19.4s, v5.s[3]\n"
                    "fmla v30.4s, v19.4s, v6.s[3]\n"
                    "fmla v31.4s, v19.4s, v7.s[3]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            // Eight-row kernel variant in which every A row contributes five floats
            // (one 128-bit load plus one scalar load) and every 4-wide output block
            // consumes five 128-bit vectors of B (0x50 bytes).
            // NOTE(review): presumably selected when K == 5 -- the switch expression
            // lies outside this excerpt; confirm against the enclosing switch.
            case 5:
                __asm __volatile (
                    // Symbolic names for the scratch registers that hold the A and C
                    // pointers of rows 1..7 (row 0 uses the a_ptr0 / c_ptr0 operands).
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    // Row N pointer = row N-1 pointer + stride; the lda / ldc operands
                    // are bound to ldab / ldcb, i.e. strides already scaled to bytes.
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    // Out-of-bounds rows: walking from row 7 down to row 1, one row per
                    // count in oob_rows has both its A and C pointer replaced by row 0's.
                    // Such a row then loads row 0's A data and stores to row 0's C
                    // address. Presumably this keeps the kernel from touching memory
                    // beyond the valid rows -- confirm against the caller.
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    // Load A once; it stays in registers for the rest of the statement.
                    // Row r keeps floats 0..3 in v(2r) (q load, pointer post-incremented
                    // by 0x10) and float 4 in lane 0 of v(2r+1) (s load).
                    // The first B block goes to v16..v20. Prefetches are issued relative
                    // to a_ptr7 only.
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q2, [a_ptr1], #0x10\n"
                    "ldr q4, [a_ptr2], #0x10\n"
                    "ldr q6, [a_ptr3], #0x10\n"
                    "ldr s1, [%[a_ptr0]]\n"
                    "ldr q8, [a_ptr4], #0x10\n"
                    "ldr s3, [a_ptr1]\n"
                    "ldr q10, [a_ptr5], #0x10\n"
                    "ldr s5, [a_ptr2]\n"
                    "ldr q12, [a_ptr6], #0x10\n"
                    "ldr s7, [a_ptr3]\n"
                    "ldr q14, [a_ptr7], #0x10\n"
                    "ldr s9, [a_ptr4]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr s11, [a_ptr5]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr s13, [a_ptr6]\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr s15, [a_ptr7]\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                    // loops == 0: only one output block to produce, handled at label 2.
                    "cbz %[loops], 2f\n"
                    // First output block: seed all eight accumulators (v24..v31, one
                    // per row) with the same four bias values, then accumulate.
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    // The flags set here are consumed by the "b.eq 3f" after the fmla
                    // run; nothing in between modifies them.
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    // acc[row] += B[k] * A[row][k]: v16..v19 pair with lanes 0..3 of the
                    // even A registers, v20 pairs with lane 0 of the odd A registers.
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    // Counter reached zero: skip the steady-state loop.
                    "b.eq 3f\n"
                    // Steady-state loop. Each pass finishes the block computed so far
                    // (clamp to [*minptr, *maxptr] via fmax/fmin, store one q per row,
                    // advance every C pointer by 0x10) while loading the next B block
                    // and bias, then accumulates that next block. Every accumulator is
                    // stored before it is overwritten with the new bias.
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    // Flags from this subs are tested by "b.ne 4b" at the loop bottom.
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    // v24 has just been stored, so it can take the next bias vector.
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v25.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    // Accumulate the next block; store-prefetches for the eight C rows
                    // are interleaved with the first fmla group.
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "b.ne 4b\n"
                    // Loop exit: same clamp/store/re-seed/accumulate sequence as the
                    // loop body, but without the counter update and without the store
                    // prefetches. The block accumulated here is the last one; it is
                    // clamped and stored at label 5.
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x50\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v26.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v27.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v29.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    // v25 starts accumulating before v24 does; v24 must remain the
                    // untouched bias until v31 has been seeded from it just below.
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "b 5f\n"
                    // loops == 0 path: seed the accumulators with the bias and
                    // accumulate the single block from the B vectors loaded at label 1.
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    // Common epilogue: clamp the final block and store it. Only c_ptr0
                    // (an in/out operand) is advanced here; c_ptr1..c_ptr7 are scratch
                    // registers that are released right after.
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    // Drop the register aliases so later asm statements can define
                    // the same names again.
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    // In/out operands: every one of these is modified by the assembly.
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    // Read-only inputs: byte strides, bias step, and pointers to the clamp bounds.
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    // Clobbers: x0-x13 back the .req aliases; all vector registers,
                    // the condition flags (subs) and memory (stores to C) are declared too.
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 6:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q2, [a_ptr1], #0x10\n"
                    "ldr q4, [a_ptr2], #0x10\n"
                    "ldr q6, [a_ptr3], #0x10\n"
                    "ldr d1, [%[a_ptr0]]\n"
                    "ldr q8, [a_ptr4], #0x10\n"
                    "ldr d3, [a_ptr1]\n"
                    "ldr q10, [a_ptr5], #0x10\n"
                    "ldr d5, [a_ptr2]\n"
                    "ldr q12, [a_ptr6], #0x10\n"
                    "ldr d7, [a_ptr3]\n"
                    "ldr q14, [a_ptr7], #0x10\n"
                    "ldr d9, [a_ptr4]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr d11, [a_ptr5]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr d13, [a_ptr6]\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr d15, [a_ptr7]\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v25.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "add %[b_ptr0], %[b_ptr0], #0x60\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "mov v25.16b, v24.16b\n"
                    "str q28, [c_ptr4]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q29, [c_ptr5]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q30, [c_ptr6]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "mov v30.16b, v24.16b\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            case 7:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q2, [a_ptr1], #0x10\n"
                    "ldr q4, [a_ptr2], #0x10\n"
                    "ldr q6, [a_ptr3], #0x10\n"
                    "ldr d1, [%[a_ptr0]], #0x8\n"
                    "ldr q8, [a_ptr4], #0x10\n"
                    "ldr d3, [a_ptr1], #0x8\n"
                    "ldr q10, [a_ptr5], #0x10\n"
                    "ldr d5, [a_ptr2], #0x8\n"
                    "ldr q12, [a_ptr6], #0x10\n"
                    "ldr d7, [a_ptr3], #0x8\n"
                    "ldr q14, [a_ptr7], #0x10\n"
                    "ldr d9, [a_ptr4], #0x8\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr d11, [a_ptr5], #0x8\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr d13, [a_ptr6], #0x8\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr d15, [a_ptr7], #0x8\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "ld1 {v1.s}[2], [%[a_ptr0]]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "ld1 {v3.s}[2], [a_ptr1]\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "ld1 {v5.s}[2], [a_ptr2]\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "ld1 {v7.s}[2], [a_ptr3]\n"
                    "ld1 {v9.s}[2], [a_ptr4]\n"
                    "ld1 {v11.s}[2], [a_ptr5]\n"
                    "ld1 {v13.s}[2], [a_ptr6]\n"
                    "ld1 {v15.s}[2], [a_ptr7]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "mov v27.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "mov v28.16b, v24.16b\n"
                    "str q29, [c_ptr5]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x70\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
            default:
            case 8:
                __asm __volatile (
                    "a_ptr1 .req X0\n"
                    "a_ptr2 .req X1\n"
                    "a_ptr3 .req X2\n"
                    "a_ptr4 .req X3\n"
                    "a_ptr5 .req X4\n"
                    "a_ptr6 .req X5\n"
                    "a_ptr7 .req X6\n"
                    "c_ptr1 .req X7\n"
                    "c_ptr2 .req X8\n"
                    "c_ptr3 .req X9\n"
                    "c_ptr4 .req X10\n"
                    "c_ptr5 .req X11\n"
                    "c_ptr6 .req X12\n"
                    "c_ptr7 .req X13\n"
                    "add a_ptr1, %[a_ptr0], %[lda]\n"
                    "add c_ptr1, %[c_ptr0], %[ldc]\n"
                    "add a_ptr2, a_ptr1, %[lda]\n"
                    "add c_ptr2, c_ptr1, %[ldc]\n"
                    "add a_ptr3, a_ptr2, %[lda]\n"
                    "add c_ptr3, c_ptr2, %[ldc]\n"
                    "add a_ptr4, a_ptr3, %[lda]\n"
                    "add c_ptr4, c_ptr3, %[ldc]\n"
                    "add a_ptr5, a_ptr4, %[lda]\n"
                    "add c_ptr5, c_ptr4, %[ldc]\n"
                    "add a_ptr6, a_ptr5, %[lda]\n"
                    "add c_ptr6, c_ptr5, %[ldc]\n"
                    "add a_ptr7, a_ptr6, %[lda]\n"
                    "add c_ptr7, c_ptr6, %[ldc]\n"
                    "cbz %[oob_rows], 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr7, %[c_ptr0], #0x0\n"
                    "add a_ptr7, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr6, %[c_ptr0], #0x0\n"
                    "add a_ptr6, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr5, %[c_ptr0], #0x0\n"
                    "add a_ptr5, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr4, %[c_ptr0], #0x0\n"
                    "add a_ptr4, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr3, %[c_ptr0], #0x0\n"
                    "add a_ptr3, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr2, %[c_ptr0], #0x0\n"
                    "add a_ptr2, %[a_ptr0], #0x0\n"
                    "b.eq 1f\n"
                    "subs %[oob_rows], %[oob_rows], #0x1\n"
                    "add c_ptr1, %[c_ptr0], #0x0\n"
                    "add a_ptr1, %[a_ptr0], #0x0\n"
                    "1:\n"
                    "ldr q0, [%[a_ptr0]], #0x10\n"
                    "ldr q2, [a_ptr1], #0x10\n"
                    "ldr q4, [a_ptr2], #0x10\n"
                    "ldr q6, [a_ptr3], #0x10\n"
                    "ldr q8, [a_ptr4], #0x10\n"
                    "ldr q10, [a_ptr5], #0x10\n"
                    "ldr q12, [a_ptr6], #0x10\n"
                    "ldr q14, [a_ptr7], #0x10\n"
                    "ldr q1, [%[a_ptr0]]\n"
                    "ldr q3, [a_ptr1]\n"
                    "ldr q5, [a_ptr2]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
                    "ldr q7, [a_ptr3]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
                    "ldr q9, [a_ptr4]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
                    "ldr q11, [a_ptr5]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
                    "ldr q13, [a_ptr6]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
                    "ldr q15, [a_ptr7]\n"
                    "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "ldr q23, [%[b_ptr0], #0x70]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "cbz %[loops], 2f\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "fmla v24.4s, v23.4s, v1.s[3]\n"
                    "fmla v25.4s, v23.4s, v3.s[3]\n"
                    "fmla v26.4s, v23.4s, v5.s[3]\n"
                    "fmla v27.4s, v23.4s, v7.s[3]\n"
                    "fmla v28.4s, v23.4s, v9.s[3]\n"
                    "fmla v29.4s, v23.4s, v11.s[3]\n"
                    "fmla v30.4s, v23.4s, v13.s[3]\n"
                    "fmla v31.4s, v23.4s, v15.s[3]\n"
                    "b.eq 3f\n"
                    "4:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "subs %[loops], %[loops], #0x1\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "mov v27.16b, v24.16b\n"
                    "ldr q23, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "str q29, [c_ptr5]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "fmla v24.4s, v23.4s, v1.s[3]\n"
                    "fmla v25.4s, v23.4s, v3.s[3]\n"
                    "fmla v26.4s, v23.4s, v5.s[3]\n"
                    "fmla v27.4s, v23.4s, v7.s[3]\n"
                    "fmla v28.4s, v23.4s, v9.s[3]\n"
                    "fmla v29.4s, v23.4s, v11.s[3]\n"
                    "fmla v30.4s, v23.4s, v13.s[3]\n"
                    "fmla v31.4s, v23.4s, v15.s[3]\n"
                    "b.ne 4b\n"
                    "3:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "ldr q16, [%[b_ptr0]]\n"
                    "ldr q17, [%[b_ptr0], #0x10]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "ldr q18, [%[b_ptr0], #0x20]\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "ldr q19, [%[b_ptr0], #0x30]\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "ldr q20, [%[b_ptr0], #0x40]\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "ldr q21, [%[b_ptr0], #0x50]\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "ldr q24, [%[biasptr]]\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "add c_ptr1, c_ptr1, #0x10\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "add c_ptr2, c_ptr2, #0x10\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q27, [c_ptr3]\n"
                    "mov v25.16b, v24.16b\n"
                    "ldr q22, [%[b_ptr0], #0x60]\n"
                    "mov v26.16b, v24.16b\n"
                    "add c_ptr3, c_ptr3, #0x10\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q28, [c_ptr4]\n"
                    "mov v27.16b, v24.16b\n"
                    "ldr q23, [%[b_ptr0], #0x70]\n"
                    "mov v28.16b, v24.16b\n"
                    "add c_ptr4, c_ptr4, #0x10\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "str q29, [c_ptr5]\n"
                    "mov v29.16b, v24.16b\n"
                    "add c_ptr5, c_ptr5, #0x10\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "str q30, [c_ptr6]\n"
                    "mov v30.16b, v24.16b\n"
                    "add c_ptr6, c_ptr6, #0x10\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "str q31, [c_ptr7]\n"
                    "mov v31.16b, v24.16b\n"
                    "add c_ptr7, c_ptr7, #0x10\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "add %[b_ptr0], %[b_ptr0], #0x80\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "fmla v24.4s, v23.4s, v1.s[3]\n"
                    "fmla v25.4s, v23.4s, v3.s[3]\n"
                    "fmla v26.4s, v23.4s, v5.s[3]\n"
                    "fmla v27.4s, v23.4s, v7.s[3]\n"
                    "fmla v28.4s, v23.4s, v9.s[3]\n"
                    "fmla v29.4s, v23.4s, v11.s[3]\n"
                    "fmla v30.4s, v23.4s, v13.s[3]\n"
                    "fmla v31.4s, v23.4s, v15.s[3]\n"
                    "b 5f\n"
                    "2:\n"
                    "ldr q24, [%[biasptr]]\n"
                    "add %[biasptr], %[biasptr], %[biasinc]\n"
                    "mov v25.16b, v24.16b\n"
                    "mov v26.16b, v24.16b\n"
                    "mov v27.16b, v24.16b\n"
                    "mov v28.16b, v24.16b\n"
                    "mov v29.16b, v24.16b\n"
                    "mov v30.16b, v24.16b\n"
                    "mov v31.16b, v24.16b\n"
                    "fmla v24.4s, v16.4s, v0.s[0]\n"
                    "fmla v25.4s, v16.4s, v2.s[0]\n"
                    "fmla v26.4s, v16.4s, v4.s[0]\n"
                    "fmla v27.4s, v16.4s, v6.s[0]\n"
                    "fmla v28.4s, v16.4s, v8.s[0]\n"
                    "fmla v29.4s, v16.4s, v10.s[0]\n"
                    "fmla v30.4s, v16.4s, v12.s[0]\n"
                    "fmla v31.4s, v16.4s, v14.s[0]\n"
                    "fmla v24.4s, v17.4s, v0.s[1]\n"
                    "fmla v25.4s, v17.4s, v2.s[1]\n"
                    "fmla v26.4s, v17.4s, v4.s[1]\n"
                    "fmla v27.4s, v17.4s, v6.s[1]\n"
                    "fmla v28.4s, v17.4s, v8.s[1]\n"
                    "fmla v29.4s, v17.4s, v10.s[1]\n"
                    "fmla v30.4s, v17.4s, v12.s[1]\n"
                    "fmla v31.4s, v17.4s, v14.s[1]\n"
                    "fmla v24.4s, v18.4s, v0.s[2]\n"
                    "fmla v25.4s, v18.4s, v2.s[2]\n"
                    "fmla v26.4s, v18.4s, v4.s[2]\n"
                    "fmla v27.4s, v18.4s, v6.s[2]\n"
                    "fmla v28.4s, v18.4s, v8.s[2]\n"
                    "fmla v29.4s, v18.4s, v10.s[2]\n"
                    "fmla v30.4s, v18.4s, v12.s[2]\n"
                    "fmla v31.4s, v18.4s, v14.s[2]\n"
                    "fmla v24.4s, v19.4s, v0.s[3]\n"
                    "fmla v25.4s, v19.4s, v2.s[3]\n"
                    "fmla v26.4s, v19.4s, v4.s[3]\n"
                    "fmla v27.4s, v19.4s, v6.s[3]\n"
                    "fmla v28.4s, v19.4s, v8.s[3]\n"
                    "fmla v29.4s, v19.4s, v10.s[3]\n"
                    "fmla v30.4s, v19.4s, v12.s[3]\n"
                    "fmla v31.4s, v19.4s, v14.s[3]\n"
                    "fmla v24.4s, v20.4s, v1.s[0]\n"
                    "fmla v25.4s, v20.4s, v3.s[0]\n"
                    "fmla v26.4s, v20.4s, v5.s[0]\n"
                    "fmla v27.4s, v20.4s, v7.s[0]\n"
                    "fmla v28.4s, v20.4s, v9.s[0]\n"
                    "fmla v29.4s, v20.4s, v11.s[0]\n"
                    "fmla v30.4s, v20.4s, v13.s[0]\n"
                    "fmla v31.4s, v20.4s, v15.s[0]\n"
                    "fmla v24.4s, v21.4s, v1.s[1]\n"
                    "fmla v25.4s, v21.4s, v3.s[1]\n"
                    "fmla v26.4s, v21.4s, v5.s[1]\n"
                    "fmla v27.4s, v21.4s, v7.s[1]\n"
                    "fmla v28.4s, v21.4s, v9.s[1]\n"
                    "fmla v29.4s, v21.4s, v11.s[1]\n"
                    "fmla v30.4s, v21.4s, v13.s[1]\n"
                    "fmla v31.4s, v21.4s, v15.s[1]\n"
                    "fmla v24.4s, v22.4s, v1.s[2]\n"
                    "fmla v25.4s, v22.4s, v3.s[2]\n"
                    "fmla v26.4s, v22.4s, v5.s[2]\n"
                    "fmla v27.4s, v22.4s, v7.s[2]\n"
                    "fmla v28.4s, v22.4s, v9.s[2]\n"
                    "fmla v29.4s, v22.4s, v11.s[2]\n"
                    "fmla v30.4s, v22.4s, v13.s[2]\n"
                    "fmla v31.4s, v22.4s, v15.s[2]\n"
                    "fmla v24.4s, v23.4s, v1.s[3]\n"
                    "fmla v25.4s, v23.4s, v3.s[3]\n"
                    "fmla v26.4s, v23.4s, v5.s[3]\n"
                    "fmla v27.4s, v23.4s, v7.s[3]\n"
                    "fmla v28.4s, v23.4s, v9.s[3]\n"
                    "fmla v29.4s, v23.4s, v11.s[3]\n"
                    "fmla v30.4s, v23.4s, v13.s[3]\n"
                    "fmla v31.4s, v23.4s, v15.s[3]\n"
                    "5:\n"
                    "ld1r {v22.4s}, [%[minptr]]\n"
                    "ld1r {v23.4s}, [%[maxptr]]\n"
                    "fmax v24.4s, v24.4s, v22.4s\n"
                    "fmax v25.4s, v25.4s, v22.4s\n"
                    "fmax v26.4s, v26.4s, v22.4s\n"
                    "fmax v27.4s, v27.4s, v22.4s\n"
                    "fmin v24.4s, v24.4s, v23.4s\n"
                    "fmin v25.4s, v25.4s, v23.4s\n"
                    "fmin v26.4s, v26.4s, v23.4s\n"
                    "fmin v27.4s, v27.4s, v23.4s\n"
                    "str q24, [%[c_ptr0]]\n"
                    "fmax v28.4s, v28.4s, v22.4s\n"
                    "add %[c_ptr0], %[c_ptr0], #0x10\n"
                    "fmax v29.4s, v29.4s, v22.4s\n"
                    "str q25, [c_ptr1]\n"
                    "fmax v30.4s, v30.4s, v22.4s\n"
                    "fmin v28.4s, v28.4s, v23.4s\n"
                    "fmax v31.4s, v31.4s, v22.4s\n"
                    "str q26, [c_ptr2]\n"
                    "fmin v29.4s, v29.4s, v23.4s\n"
                    "fmin v30.4s, v30.4s, v23.4s\n"
                    "fmin v31.4s, v31.4s, v23.4s\n"
                    "str q27, [c_ptr3]\n"
                    "str q28, [c_ptr4]\n"
                    "str q29, [c_ptr5]\n"
                    "str q30, [c_ptr6]\n"
                    "str q31, [c_ptr7]\n"
                    ".unreq a_ptr1\n"
                    ".unreq a_ptr2\n"
                    ".unreq a_ptr3\n"
                    ".unreq a_ptr4\n"
                    ".unreq a_ptr5\n"
                    ".unreq a_ptr6\n"
                    ".unreq a_ptr7\n"
                    ".unreq c_ptr1\n"
                    ".unreq c_ptr2\n"
                    ".unreq c_ptr3\n"
                    ".unreq c_ptr4\n"
                    ".unreq c_ptr5\n"
                    ".unreq c_ptr6\n"
                    ".unreq c_ptr7\n"
                    : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [oob_rows] "+r" (oob_rows), [biasptr] "+r" (biasptr)
                    : [lda] "r" (ldab), [ldc] "r" (ldcb), [biasinc] "r" (biasinc), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                    : "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                );
                break;
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__
